In [61]:
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from dotenv import load_dotenv
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import shapiro, anderson, kstest, normaltest
from scipy.signal import savgol_filter
from sklearn.neighbors import KernelDensity
import streamlit as st
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer
import scikit_posthocs as sp
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from tabulate import tabulate
from scipy.stats import chi2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from pingouin import multivariate_normality
from pygam import LogisticGAM, s, f
from imblearn.over_sampling import ADASYN
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split


import networkx as nx
from community import community_louvain
import numpy as np

## Helper functions
from helper_functions import find_best_hyperparameters, permutation_test_logistic, mahalanobis_distance,fit_and_predict,  cool_plotting, bootstrap_ci, kde_density, plot_confusion_matrix, calculateClasswiseTopNAccuracy, calculateMetrics, calculateMetricsAndPrint
################################## You can change values inside the following list ###########################
topNValues = [10,20,30,40,50,60,70,80,90]
##############################################################################################################

Env¶

In [3]:
load_dotenv()
os.environ["KAGGLE_USERNAME"] = os.getenv("KAGGLE_USERNAME")
os.environ["KAGGLE_KEY"] = os.getenv("KAGGLE_KEY")
api = KaggleApi()
api.authenticate()
dataset_name = "fedesoriano/stroke-prediction-dataset"
api.dataset_download_files(dataset_name, path=os.getcwd(), unzip=True)
Dataset URL: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
In [4]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head()
Out[4]:
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1

EDA¶

In [5]:
print(df.isna().sum())
df.dropna(inplace = True)
df = df[df.gender !="Other"]
df.reset_index(inplace = True,drop=True)
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
In [6]:
sns.pairplot(df, hue="stroke")
plt.show()
No description has been provided for this image
In [7]:
sns.boxplot(data=df, x="stroke", y="bmi", palette="Set2",legend=False, hue = "stroke")
plt.title("BMI by Stroke Status")
plt.show()
No description has been provided for this image
In [10]:
fig = plt.figure(figsize=(20, 12), facecolor='white')
ax = [None for _ in range(3)]
gs = fig.add_gridspec(3, 1)
gs.update(wspace=0, hspace=0.8)

# Titles and text
ax[0] = fig.add_subplot(gs[0, 0])
ax[1] = fig.add_subplot(gs[1, 0])
ax[2] = fig.add_subplot(gs[2, 0])

# Relationship between age and stroke
ax[0].text(-20, 0.04, 'Relationship between age and stroke', fontsize=21, fontweight='bold', fontfamily='monospace')
ax[0].text(-20, 0.035, 'The older a person is, the more likely he/she has a stroke', fontsize=16, fontweight='light', fontfamily='monospace')

# Relationship between average glucose level and stroke
ax[1].text(-30, 0.023, 'Relationship between average glucose level and stroke', fontsize=21, fontweight='bold', fontfamily='monospace')
ax[1].text(-30, 0.0207, 'No clear relationship between avg_glucose_level and stroke', fontsize=16, fontweight='light', fontfamily='monospace')

# Relationship between BMI and stroke
ax[2].text(0, 0.09053, 'Relationship between BMI and stroke', fontsize=21, fontweight='bold', fontfamily='monospace')

# Plot KDE for stroke=1 (red) and stroke=0 (green) for age
sns.kdeplot(data=df[df.stroke == 1], x='age', ax=ax[0], fill=True, color='lightcoral', alpha=1)
sns.kdeplot(data=df[df.stroke == 0], x='age', ax=ax[0], fill=True, color='palegreen', alpha=0.5)

# Plot KDE for stroke=1 (red) and stroke=0 (green) for avg_glucose_level
sns.kdeplot(data=df[df.stroke == 1], x='avg_glucose_level', ax=ax[1], fill=True, color='lightcoral', alpha=1)
sns.kdeplot(data=df[df.stroke == 0], x='avg_glucose_level', ax=ax[1], fill=True, color='palegreen', alpha=0.5)

# Plot KDE for stroke=1 (red) and stroke=0 (green) for BMI
sns.kdeplot(data=df[df.stroke == 1], x='bmi', ax=ax[2], fill=True, color='lightcoral', alpha=1)
sns.kdeplot(data=df[df.stroke == 0], x='bmi', ax=ax[2], fill=True, color='palegreen', alpha=0.5)

for i in range(3):
    ax[i].set_yticklabels('')
    ax[i].set_ylabel('')
    ax[i].tick_params(axis='y', length=0)
    for direction in ['top', 'right', 'left']:
        ax[i].spines[direction].set_visible(False)

plt.show()
No description has been provided for this image
In [11]:
fig = plt.figure(figsize=(10, 5), dpi=150, facecolor='white')
gs = fig.add_gridspec(2, 1)
gs.update(wspace=0.11, hspace=0.5)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor('white')

df['age'] = df['age'].astype(int)

# Calculate risk rate by age
rate = []
for i in range(df['age'].min(), df['age'].max()):
    subset = df[df['age'] < i]
    if len(subset) > 0:  # Avoid division by zero
        rate.append(subset['stroke'].sum() / len(subset['stroke']))
    else:
        rate.append(0)  # Assign 0 risk if no data points exist

# Apply Savitzky-Golay filter to smooth the rate curve
smoothed_rate = savgol_filter(rate, window_length=11, polyorder=2)
sns.lineplot(data=smoothed_rate, color='#0f4c81', ax=ax0)
for s in ["top", "right", "left"]:
    ax0.spines[s].set_visible(False)

ax0.tick_params(axis='both', which='major', labelsize=8)
ax0.tick_params(axis=u'both', which=u'both', length=0)
ax0.text(-3, 0.055, 'Risk Increase by Age', fontsize=18, fontfamily='serif', fontweight='bold')
ax0.text(-3, 0.047, 'As age increases, so does the risk of having a stroke', fontsize=14, fontfamily='serif')
plt.show()
No description has been provided for this image

Statistical Testing¶

In [12]:
differences, ci_lower, ci_upper = bootstrap_ci(df, "stroke", "age", group1=1, group2=0,stat_fn=np.median, n_iterations=1000, alpha=0.05)
plt.hist(differences, bins=30, color="skyblue", edgecolor="black")
plt.axvline(ci_lower, color="red", linestyle="--", label=f"95% CI Lower: {ci_lower:.2f}")
plt.axvline(ci_upper, color="green", linestyle="--", label=f"95% CI Upper: {ci_upper:.2f}")
plt.title("Bootstrap CI for Median Difference (bmi)")
plt.xlabel("Difference in Medians")
plt.ylabel("Frequency")
plt.legend()
plt.show()
No description has been provided for this image
In [13]:
differences, ci_lower, ci_upper = bootstrap_ci(df, "stroke", "bmi", group1=1, group2=0,stat_fn=np.median, n_iterations=1000, alpha=0.05)
plt.hist(differences, bins=30, color="skyblue", edgecolor="black")
plt.axvline(ci_lower, color="red", linestyle="--", label=f"95% CI Lower: {ci_lower:.2f}")
plt.axvline(ci_upper, color="green", linestyle="--", label=f"95% CI Upper: {ci_upper:.2f}")
plt.title("Bootstrap CI for Median Difference (bmi)")
plt.xlabel("Difference in Medians")
plt.ylabel("Frequency")
plt.legend()
plt.show()
No description has been provided for this image
In [14]:
## BOOTSTRAP CI
differences, ci_lower, ci_upper = bootstrap_ci(df, "stroke", "avg_glucose_level", group1=1, group2=0,stat_fn=np.median, n_iterations=1000, alpha=0.05)
plt.hist(differences, bins=30, color="skyblue", edgecolor="black")
plt.axvline(ci_lower, color="red", linestyle="--", label=f"95% CI Lower: {ci_lower:.2f}")
plt.axvline(ci_upper, color="green", linestyle="--", label=f"95% CI Upper: {ci_upper:.2f}")
plt.title("Bootstrap CI for Median Difference (avg_glucose_level)")
plt.xlabel("Difference in Medians")
plt.ylabel("Frequency")
plt.legend()
plt.show()
No description has been provided for this image

Proving need for nonparametric methods:¶

lets prove that whatever we do we cant make them normal --> we will have to use some form of nonparam approach !! logistic regression doe snot assume notmality

In [15]:
scaler = PowerTransformer(method='yeo-johnson', standardize=False, copy=True)
In [16]:
# numerical features
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
data = df[numerical_cols].dropna()
In [17]:
data_scaled_1 = scaler.fit_transform(data)
In [18]:
alpha = 0.05
num_tests = len(numerical_cols) * 4  # 3 features * 4 tests per feature
adjusted_alpha = alpha / num_tests  # Bonferroni correction

scalers = {"PowerTransformer_YeoJohnson": data_scaled_1}

for scaler_name, scaled_data in scalers.items():
    print(f"=== Testing Normality for {scaler_name} ===")
    for i, col in enumerate(numerical_cols):
        print(f"\nFeature: {col}")

        # Shapiro-Wilk Test
        stat, p = shapiro(scaled_data[:, i])
        print(f"Shapiro-Wilk: W={stat:.4f}, p={p:.4f} (Bonferroni adj: {adjusted_alpha:.5f}) {'REJECT' if p < adjusted_alpha else 'FAIL TO REJECT'}")

        # Kolmogorov-Smirnov Test
        stat, p = kstest(scaled_data[:, i], 'norm')
        print(f"Kolmogorov-Smirnov: D={stat:.4f}, p={p:.4f} (Bonferroni adj: {adjusted_alpha:.5f}) {'REJECT' if p < adjusted_alpha else 'FAIL TO REJECT'}")

        # Anderson-Darling Test
        ad_test = anderson(scaled_data[:, i], dist='norm')
        print(f"Anderson-Darling: A^2={ad_test.statistic:.4f}, Critical Values={ad_test.critical_values}")

        # D'Agostino & Pearson Test
        stat, p = normaltest(scaled_data[:, i])
        print(f"D'Agostino & Pearson: K^2={stat:.4f}, p={p:.4f} (Bonferroni adj: {adjusted_alpha:.5f}) {'REJECT' if p < adjusted_alpha else 'FAIL TO REJECT'}")

    # Multivariate Normality Test
    print(f"\n=== Multivariate Normality Test for {scaler_name} ===")
    mardia_test = multivariate_normality(pd.DataFrame(scaled_data, columns=numerical_cols), alpha=adjusted_alpha)
    print(mardia_test)
=== Testing Normality for PowerTransformer_YeoJohnson ===

Feature: age
Shapiro-Wilk: W=0.9667, p=0.0000 (Bonferroni adj: 0.00417) REJECT
Kolmogorov-Smirnov: D=0.9614, p=0.0000 (Bonferroni adj: 0.00417) REJECT
Anderson-Darling: A^2=35.7757, Critical Values=[0.576 0.655 0.786 0.917 1.091]
D'Agostino & Pearson: K^2=642.5291, p=0.0000 (Bonferroni adj: 0.00417) REJECT

Feature: avg_glucose_level
Shapiro-Wilk: W=0.9829, p=0.0000 (Bonferroni adj: 0.00417) REJECT
Kolmogorov-Smirnov: D=0.8207, p=0.0000 (Bonferroni adj: 0.00417) REJECT
Anderson-Darling: A^2=14.7962, Critical Values=[0.576 0.655 0.786 0.917 1.091]
D'Agostino & Pearson: K^2=86.6995, p=0.0000 (Bonferroni adj: 0.00417) REJECT

Feature: bmi
Shapiro-Wilk: W=0.9982, p=0.0000 (Bonferroni adj: 0.00417) REJECT
Kolmogorov-Smirnov: D=0.9928, p=0.0000 (Bonferroni adj: 0.00417) REJECT
Anderson-Darling: A^2=2.4938, Critical Values=[0.576 0.655 0.786 0.917 1.091]
D'Agostino & Pearson: K^2=8.7221, p=0.0128 (Bonferroni adj: 0.00417) FAIL TO REJECT

=== Multivariate Normality Test for PowerTransformer_YeoJohnson ===
HZResults(hz=np.float64(14.196748067407647), pval=np.float64(3.0522060333347687e-140), normal=False)
In [19]:
for scaler_name, scaled_data in scalers.items():
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle(f"Histogram & KDE for {scaler_name}")
    for i, col in enumerate(numerical_cols):
        sns.histplot(scaled_data[:, i], kde=True, ax=axes[i])
        axes[i].set_title(f"{col}")
    plt.show()
No description has been provided for this image
In [20]:
for scaler_name, scaled_data in scalers.items():
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle(f"Q-Q Plots for {scaler_name}")
    for i, col in enumerate(numerical_cols):
        stats.probplot(scaled_data[:, i], dist="norm", plot=axes[i])
        axes[i].set_title(f"{col}")
    plt.show()
No description has been provided for this image
In [21]:
# we could make some but not all normal

Grouping¶

In [22]:
numerical = ['bmi', 'age',"avg_glucose_level"]
categorical =['gender','hypertension', 'heart_disease', 'ever_married','work_type', 'Residence_type', 'smoking_status', "stroke"]
In [23]:
numerical_df = df[numerical]
categorical_df = df[categorical]
data_no_stroke = categorical_df.drop(columns=["stroke"])
In [24]:
data_encoded = pd.get_dummies(categorical_df, columns=categorical, drop_first=True)
data_for_grouping = pd.concat([data_encoded,df[numerical]], axis = 1)*1
In [25]:
cosine_sim_matrix = cosine_similarity(data_for_grouping.drop("stroke_1", axis =1)) # exclude stroke from the grouping
In [26]:
figsize = (10, 8)
plt.figure(figsize=figsize)
sns.heatmap(cosine_sim_matrix, annot=False, cmap='viridis')
plt.title('Cosine Similarity Heatmap')
plt.xlabel('Patient')
plt.ylabel('Patient')
plt.show()
No description has been provided for this image
In [27]:
threshold = 0.0 # no filtering used
adjacency_matrix = np.where(cosine_sim_matrix > threshold, cosine_sim_matrix, 0)
graph = nx.from_numpy_array(adjacency_matrix)
In [28]:
# Perform community detection using Louvain
partition = community_louvain.best_partition(graph, random_state=42)
In [29]:
# Add community labels to your DataFrame
numerical_df = numerical_df.assign(community=list(partition.values()))

# Print community distribution
print(numerical_df['community'].value_counts())
community
1    2504
3    2384
4      10
2       9
0       1
Name: count, dtype: int64
In [30]:
community_stats = numerical_df.groupby('community').mean()
print(community_stats)
                 bmi        age  avg_glucose_level
community                                         
0          32.100000  45.000000         106.830000
1          27.238498  30.174121         124.514034
2          28.633333  53.222222         122.570000
3          30.613591  56.185822          85.151674
4          33.670000  31.800000          80.496000
In [31]:
Grouped_df = data_for_grouping.copy()
Grouped_df['community'] =  list(partition.values())
GROUP_2 = Grouped_df[Grouped_df.community.isin([3,4])].drop("community", axis =1)
GROUP_1 = Grouped_df[Grouped_df.community.isin([0,1,2])].drop("community", axis =1)
In [32]:
GROUP_1.describe()
Out[32]:
gender_Male hypertension_1 heart_disease_1 ever_married_Yes work_type_Never_worked work_type_Private work_type_Self-employed work_type_children Residence_type_Urban smoking_status_formerly smoked smoking_status_never smoked smoking_status_smokes stroke_1 bmi age avg_glucose_level
count 2514.000000 2514.00000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.000000 2514.00000 2514.000000
mean 0.435959 0.06961 0.036595 0.431583 0.008751 0.541766 0.093079 0.266905 0.503580 0.118934 0.350040 0.127685 0.031026 27.245426 30.26253 124.500040
std 0.495980 0.25454 0.187803 0.495396 0.093155 0.498352 0.290601 0.442430 0.500087 0.323775 0.477077 0.333805 0.173423 8.109465 21.64740 52.501464
min 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.300000 0.00000 55.120000
25% 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 21.200000 13.25000 85.202500
50% 0.000000 0.00000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 26.100000 26.00000 106.350000
75% 1.000000 0.00000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 31.800000 44.00000 153.292500
max 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 71.900000 82.00000 271.740000
In [33]:
GROUP_2.describe()
Out[33]:
gender_Male hypertension_1 heart_disease_1 ever_married_Yes work_type_Never_worked work_type_Private work_type_Self-employed work_type_children Residence_type_Urban smoking_status_formerly smoked smoking_status_never smoked smoking_status_smokes stroke_1 bmi age avg_glucose_level
count 2394.000000 2394.000000 2394.000000 2394.000000 2394.0 2394.000000 2394.000000 2394.0 2394.000000 2394.000000 2394.000000 2394.000000 2394.000000 2394.000000 2394.00000 2394.000000
mean 0.382206 0.115288 0.063074 0.885129 0.0 0.604845 0.225982 0.0 0.511278 0.224311 0.406015 0.173768 0.054720 30.626358 56.08396 85.132226
std 0.486028 0.319436 0.243147 0.318932 0.0 0.488986 0.418315 0.0 0.499977 0.417215 0.491190 0.378989 0.227481 7.182009 14.54098 18.927130
min 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 11.300000 17.00000 55.220000
25% 0.000000 0.000000 0.000000 1.000000 0.0 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 26.000000 45.00000 71.237500
50% 0.000000 0.000000 0.000000 1.000000 0.0 1.000000 0.000000 0.0 1.000000 0.000000 0.000000 0.000000 0.000000 29.500000 55.00000 82.820000
75% 1.000000 0.000000 0.000000 1.000000 0.0 1.000000 0.000000 0.0 1.000000 0.000000 1.000000 0.000000 0.000000 34.000000 67.00000 95.880000
max 1.000000 1.000000 1.000000 1.000000 0.0 1.000000 1.000000 0.0 1.000000 1.000000 1.000000 1.000000 1.000000 97.600000 82.00000 181.230000

Distances withing groups¶

In [34]:
scaler   = QuantileTransformer(output_distribution='normal', random_state=0)
scaler_2 = QuantileTransformer(output_distribution='normal', random_state=0)

data_scaled_1 = scaler.fit_transform(GROUP_1)
data_scaled_2 = scaler_2.fit_transform(GROUP_2)
In [35]:
# Calculate the covariance matrix and its inverse
cov_matrix = np.cov(data_scaled_1, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Calculate the mean of the data
mean_vector = np.mean(data_scaled_1, axis=0)

md = mahalanobis_distance(data_scaled_1, mean_vector, inv_cov_matrix)
GROUP_1['Mahalanobis_Distance'] = md
In [36]:
# Degrees of freedom is the number of variables
df_variables = data_scaled_1.shape[1]

# Set significance level (e.g., 0.01 for 99% confidence)
alpha = 0.01
threshold = chi2.ppf((1 - alpha), df_variables)
GROUP_1['Mah_Outlier'] = GROUP_1['Mahalanobis_Distance'] > threshold
In [37]:
GROUP_1.Mah_Outlier.value_counts()
Out[37]:
Mah_Outlier
False    2270
True      244
Name: count, dtype: int64
In [38]:
plt.figure(figsize=(12, 6))
plt.title('Mahalanobis Distance')
plt.xlabel('Observation Number')
plt.ylabel('Mahalanobis Distance')
# Scatter plot of Mahalanobis distances
plt.scatter(range(len(GROUP_1)), GROUP_1['Mahalanobis_Distance'], c=GROUP_1['Mah_Outlier'], cmap='coolwarm')
plt.axhline(y=threshold, color='r', linestyle='--')
plt.show()
No description has been provided for this image
In [39]:
# Add the 'Outlier' column back to the standardized data for plotting
data_subset_1 = pd.DataFrame(GROUP_1, columns=numerical_cols)
data_subset_1['Mah_Outlier'] = GROUP_1['Mah_Outlier'].values
sns.pairplot(data_subset_1, hue='Mah_Outlier', diag_kind='kde', markers=["o", "s"])
plt.show()
No description has been provided for this image
In [40]:
lof   = LocalOutlierFactor(n_neighbors=20, contamination=0.02)
lof_2 = LocalOutlierFactor(n_neighbors=20, contamination=0.02)

GROUP_1['LOF_Score'] = lof.fit_predict(data_scaled_1)
GROUP_2['LOF_Score'] = lof_2.fit_predict(data_scaled_2)

# LOF_Score: -1 indicates outlier, 1 indicates inlier
GROUP_1['LOF_Outlier'] = GROUP_1['LOF_Score'] == -1
GROUP_2['LOF_Outlier'] = GROUP_2['LOF_Score'] == -1

GROUP_1.LOF_Outlier.value_counts()
GROUP_2.LOF_Outlier.value_counts()
Out[40]:
LOF_Outlier
False    2346
True       48
Name: count, dtype: int64
In [41]:
plt.figure(figsize=(12, 6))
plt.scatter(GROUP_1['age'], GROUP_1['bmi'], c=GROUP_1['LOF_Outlier'], cmap='coolwarm', edgecolor='k')
plt.xlabel('Age')
plt.ylabel('BMI')
plt.title('Local Outlier Factor Detection')
plt.show()
No description has been provided for this image
In [42]:
plt.figure(figsize=(12, 6))
plt.scatter(GROUP_2['age'], GROUP_2['bmi'], c=GROUP_2['LOF_Outlier'], cmap='coolwarm', edgecolor='k')
plt.xlabel('Age')
plt.ylabel('BMI')
plt.title('Local Outlier Factor Detection')
plt.show()
No description has been provided for this image
In [43]:
iso_forest = IsolationForest(contamination=0.03, random_state=42)
iso_forest_2 = IsolationForest(contamination=0.03, random_state=42)

iso_forest.fit(data_scaled_1)
iso_forest_2.fit(data_scaled_2)

GROUP_1['IF_Score'] = iso_forest.decision_function(data_scaled_1)
GROUP_1['IF_Outlier'] = iso_forest.predict(data_scaled_1) == -1

GROUP_2['IF_Score'] = iso_forest_2.decision_function(data_scaled_2)
GROUP_2['IF_Outlier'] = iso_forest_2.predict(data_scaled_2) == -1
GROUP_1.IF_Outlier.value_counts()
Out[43]:
IF_Outlier
False    2438
True       76
Name: count, dtype: int64
In [44]:
plt.figure(figsize=(12, 6))
plt.scatter(GROUP_1['age'], GROUP_1['bmi'], c=GROUP_1['IF_Outlier'], cmap='coolwarm', edgecolor='k')
plt.xlabel('Average Glucose Level')
plt.ylabel('BMI')
plt.title('Isolation Forest Outlier Detection')
plt.show()
No description has been provided for this image
In [45]:
plt.figure(figsize=(12, 6))
plt.scatter(GROUP_2['age'], GROUP_2['bmi'], c=GROUP_2['IF_Outlier'], cmap='coolwarm', edgecolor='k')
plt.xlabel('Average Glucose Level')
plt.ylabel('BMI')
plt.title('Isolation Forest Outlier Detection')
plt.show()
No description has been provided for this image
In [46]:
outlier_methods = ['Mah_Outlier', 'LOF_Outlier', 'IF_Outlier']
outlier_flags = GROUP_1[outlier_methods]
outlier_flags_2 = GROUP_2[['LOF_Outlier', 'IF_Outlier']]

GROUP_1['Outlier_Score'] = outlier_flags.sum(axis=1)
GROUP_2['Outlier_Score'] = outlier_flags_2.sum(axis=1)

GROUP_1['Multi_Method_Outlier'] = GROUP_1['Outlier_Score'] >= 2  # Adjust threshold as needed
GROUP_2['Multi_Method_Outlier'] = GROUP_2['Outlier_Score'] == 2  # Adjust threshold as needed

num_multi_outliers = GROUP_1['Multi_Method_Outlier'].sum()
num_multi_outliers_2 = GROUP_2['Multi_Method_Outlier'].sum()

print(f"Number of observations flagged by multiple methods for group 1: {num_multi_outliers}")
print(f"Number of observations flagged by multiple methods for Group 2: {num_multi_outliers_2}")

multi_outliers = GROUP_1[GROUP_1['Multi_Method_Outlier']]
multi_outliers_2 = GROUP_2[GROUP_2['Multi_Method_Outlier']]
Number of observations flagged by multiple methods for group 1: 93
Number of observations flagged by multiple methods for Group 2: 1
In [47]:
# Plotting with multi-method outliers highlighted
plt.figure(figsize=(10, 6))
plt.scatter(GROUP_1['age'], GROUP_1['bmi'], c=GROUP_1['Multi_Method_Outlier'], cmap='coolwarm', edgecolor='k')
plt.xlabel('Age')
plt.ylabel('BMI')
plt.title('Outliers Detected by Multiple Methods')
plt.show()
No description has been provided for this image
In [48]:
# Count multi-method outliers with stroke
multi_outliers_with_stroke = GROUP_1[(GROUP_1['Multi_Method_Outlier']) & (GROUP_1['stroke_1'] == 1)]
num_multi_outliers_with_stroke = len(multi_outliers_with_stroke)
print(f"Number of multi-method outliers with stroke: {num_multi_outliers_with_stroke}")

# Compare to total outliers and strokes
total_multi_outliers = GROUP_1['Multi_Method_Outlier'].sum()
total_stroke_cases = GROUP_1['stroke_1'].sum()
print(f"Total multi-method outliers: {total_multi_outliers}")
print(f"Total stroke cases: {total_stroke_cases}")
Number of multi-method outliers with stroke: 49
Total multi-method outliers: 93
Total stroke cases: 78

Classification¶

Group-based analysis¶

Group 1¶

In [49]:
model = LogisticRegression(random_state=42)
scaler = None
sampling = "SMOTE"
fit_and_predict(model, GROUP_1, scaling = StandardScaler(), plots = True, sampling= sampling, topNValues = topNValues)
==================================================
             Fit and Predict Summary              
==================================================

Evaluation Metrics:
--------------------------------------------------
╒═══════════════════╤═════════╕
│ Metric            │   Value │
╞═══════════════════╪═════════╡
│ Class 0           │    0.99 │
├───────────────────┼─────────┤
│ Class 1           │    1    │
├───────────────────┼─────────┤
│ Balanced Accuracy │    1    │
╘═══════════════════╧═════════╛
+ Accuracy for top 10 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 20 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 30 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 40 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 50 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 60 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 70 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 80 percent predictions for Healthy: 100.00, Stroke: 79.31
+ Accuracy for top 90 percent predictions for Healthy: 100.00, Stroke: 79.31
No description has been provided for this image
Permutation Test Results:
+----------+----------------------+-----------+
| Metric   |   Observed Statistic |   p-value |
+==========+======================+===========+
| accuracy |               0.9921 |         0 |
+----------+----------------------+-----------+
| log_loss |              -0.0207 |         0 |
+----------+----------------------+-----------+
| auc      |               0.9992 |         0 |
+----------+----------------------+-----------+
No description has been provided for this image
No description has been provided for this image
In [50]:
fit_and_predict(model, GROUP_1, scaling = StandardScaler(), plots = False, sampling= sampling, topNValues = topNValues, ensemble=None, cv_folds=5, use_cv=True)
==================================================
             Fit and Predict Summary              
==================================================
Cross-validated balanced accuracy: 0.967 ± 0.028

Group 2¶

In [51]:
model = LogisticRegression(random_state=42)
sampling = "ADASYN"
fit_and_predict(model, GROUP_2, scaling = StandardScaler(), plots = True, sampling= sampling, topNValues = topNValues, ensemble=None)
==================================================
             Fit and Predict Summary              
==================================================

Evaluation Metrics:
--------------------------------------------------
╒═══════════════════╤═════════╕
│ Metric            │   Value │
╞═══════════════════╪═════════╡
│ Class 0           │    0.98 │
├───────────────────┼─────────┤
│ Class 1           │    0.97 │
├───────────────────┼─────────┤
│ Balanced Accuracy │    0.98 │
╘═══════════════════╧═════════╛
+ Accuracy for top 10 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 20 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 30 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 40 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 50 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 60 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 70 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 80 percent predictions for Healthy: 100.00, Stroke: 77.55
+ Accuracy for top 90 percent predictions for Healthy: 100.00, Stroke: 77.55
No description has been provided for this image
Permutation Test Results:
+----------+----------------------+-----------+
| Metric   |   Observed Statistic |   p-value |
+==========+======================+===========+
| accuracy |               0.9833 |         0 |
+----------+----------------------+-----------+
| log_loss |              -0.0595 |         0 |
+----------+----------------------+-----------+
| auc      |               0.9956 |         0 |
+----------+----------------------+-----------+
No description has been provided for this image
No description has been provided for this image
In [53]:
fit_and_predict(model, GROUP_2, scaling = StandardScaler(), plots = False, sampling= sampling, topNValues = topNValues, ensemble=None, cv_folds=5, use_cv=True)
==================================================
             Fit and Predict Summary              
==================================================
Cross-validated balanced accuracy: 0.918 ± 0.021

No Grouping nor feature engineering -- original data¶

In [54]:
model = LogisticRegression(random_state=42)
sampling = "SMOTE"
fit_and_predict(model, data_for_grouping, scaling = StandardScaler(), plots = False, sampling= sampling, topNValues = topNValues, ensemble=None)
==================================================
             Fit and Predict Summary              
==================================================

Evaluation Metrics:
--------------------------------------------------
╒═══════════════════╤═════════╕
│ Metric            │   Value │
╞═══════════════════╪═════════╡
│ Class 0           │    0.77 │
├───────────────────┼─────────┤
│ Class 1           │    0.71 │
├───────────────────┼─────────┤
│ Balanced Accuracy │    0.74 │
╘═══════════════════╧═════════╛

Permutation Test Results:
+----------+----------------------+-----------+
| Metric   |   Observed Statistic |   p-value |
+==========+======================+===========+
| accuracy |               0.7658 |         0 |
+----------+----------------------+-----------+
| log_loss |              -0.4698 |         0 |
+----------+----------------------+-----------+
| auc      |               0.8207 |         0 |
+----------+----------------------+-----------+
No description has been provided for this image
No description has been provided for this image
In [55]:
custom_cw = class_weight={0: 1, 1: 70}
model = LogisticRegression(class_weight=custom_cw, random_state=42,max_iter=1000)
scaler = StandardScaler()
sampling = "ADASYN"
fit_and_predict(model, data_for_grouping, scaling = scaler, plots = False, sampling= sampling, topNValues = topNValues, ensemble=None)
==================================================
             Fit and Predict Summary              
==================================================

Evaluation Metrics:
--------------------------------------------------
╒═══════════════════╤═════════╕
│ Metric            │   Value │
╞═══════════════════╪═════════╡
│ Class 0           │    0.38 │
├───────────────────┼─────────┤
│ Class 1           │    0.98 │
├───────────────────┼─────────┤
│ Balanced Accuracy │    0.68 │
╘═══════════════════╧═════════╛

Permutation Test Results:
+----------+----------------------+-----------+
| Metric   |   Observed Statistic |   p-value |
+==========+======================+===========+
| accuracy |               0.406  |         0 |
+----------+----------------------+-----------+
| log_loss |              -2.3315 |         0 |
+----------+----------------------+-----------+
| auc      |               0.8149 |         0 |
+----------+----------------------+-----------+
No description has been provided for this image
No description has been provided for this image
In [56]:
fit_and_predict(model, data_for_grouping, scaling = StandardScaler(), plots = False, sampling= sampling, topNValues = topNValues, ensemble=None, cv_folds=5, use_cv=True)
==================================================
             Fit and Predict Summary              
==================================================
Cross-validated balanced accuracy: 0.742 ± 0.017

Logistic GAM (explored, but never used)¶

In [58]:
numeric_features = ['bmi', 'age', 'avg_glucose_level']

Group 1¶

In [63]:
model_gam = LogisticGAM(f(0) +f(1) +f(2) +f(3) +f(4) +f(5) + f(6) +f(7) +f(8) +f(9) +f(10) +f(11) +s(12, n_splines=10) + s(13, n_splines=10) +s(14, n_splines=10) +f(22))

hey = GROUP_1*1
scaler = StandardScaler()
X = hey.drop(columns=['stroke_1'])
y = hey['stroke_1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.reset_index(drop = True, inplace = True)
X_test.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)
numeric_features = ['bmi', 'age', 'avg_glucose_level']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.fit_transform(X_test[numeric_features])


model_gam.fit(X_train, y_train)
y_pred = model_gam.predict(X_test)

y_pred_probs = model_gam.predict_proba(X_test)
conf_matrix = plot_confusion_matrix(y_test, y_pred)
conf_matrix

print("ROC AUC Score:", roc_auc_score(y_test, y_pred_probs))
print("Log Loss:", log_loss(y_test, y_pred_probs))
ROC AUC Score: 0.9505226894749347
Log Loss: 0.0764477319654121
No description has been provided for this image
In [64]:
lam_values = np.logspace(-3, 3, 7)  # 7 values from 10^-3 to 10^3
gam = LogisticGAM(f(0) +f(1) +f(2) +f(3) +f(4) +f(5) + f(6) +f(7) +f(8) +f(9) +f(10) +f(11) +s(12, n_splines=5) + s(13, n_splines=5) +s(14, n_splines=5) +f(22))
best_gam = gam.gridsearch(X_train.values, y_train, lam=lam_values)
print(best_gam.summary())
100% (7 of 7) |##########################| Elapsed Time: 0:00:00 Time:  0:00:000:00
LogisticGAM                                                                                               
=============================================== ==========================================================
Distribution:                      BinomialDist Effective DoF:                                     15.9105
Link Function:                        LogitLink Log Likelihood:                                   -88.4675
Number of Samples:                         1759 AIC:                                              208.7559
                                                AICc:                                             209.1038
                                                UBRE:                                               2.1259
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.6385
==========================================================================================================
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
================================= ==================== ============ ============ ============ ============
f(0)                              [0.1]                2            2.0          1.59e-01                 
f(1)                              [0.1]                2            1.0          3.01e-01                 
f(2)                              [0.1]                2            1.0          6.44e-05     ***         
f(3)                              [0.1]                2            1.0          8.65e-03     **          
f(4)                              [0.1]                2            0.3          9.75e-01                 
f(5)                              [0.1]                2            1.0          7.07e-01                 
f(6)                              [0.1]                2            0.9          1.04e-02     *           
f(7)                              [0.1]                2            0.1          9.98e-01                 
f(8)                              [0.1]                2            1.0          6.40e-01                 
f(9)                              [0.1]                2            1.0          4.78e-01                 
f(10)                             [0.1]                2            1.0          1.42e-01                 
f(11)                             [0.1]                2            1.0          9.50e-01                 
s(12)                             [0.1]                5            1.2          4.36e-01                 
s(13)                             [0.1]                5            1.3          1.91e-03     **          
s(14)                             [0.1]                5            1.3          3.31e-01                 
f(22)                             [0.1]                2            0.8          3.74e-14     ***         
intercept                                              1            0.0          5.04e-07     ***         
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
         which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
         known smoothing parameters, but when smoothing parameters have been estimated, the p-values
         are typically lower than they should be, meaning that the tests reject the null too readily.
None
/var/folders/3b/1dzvglj17w110wnj0rq1ngkh0000gn/T/ipykernel_1417/3693444541.py:4: UserWarning: KNOWN BUG: p-values computed in this summary are likely much smaller than they should be. 
 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  print(best_gam.summary())
In [65]:
# keep s13, f22, f6, f3, f2

model_gam = LogisticGAM(f(1)+f(2)+ f(3)+s(13, n_splines=10) +f(22))
model_gam.fit(X_train, y_train)
y_pred = model_gam.predict(X_test)

y_pred_probs = model_gam.predict_proba(X_test)
conf_matrix = plot_confusion_matrix(y_test, y_pred)
conf_matrix

print("ROC AUC Score:", roc_auc_score(y_test, y_pred_probs))
print("Log Loss:", log_loss(y_test, y_pred_probs))
ROC AUC Score: 0.9419695889760038
Log Loss: 0.08038423610489502
No description has been provided for this image

Group 2¶

In [66]:
model_gam = LogisticGAM(f(0) +f(1) +f(2) +f(3) +f(4) +f(5) + f(6) +f(7) +f(8) +f(9) +f(10) +f(11) +s(12, n_splines=10) + s(13, n_splines=10) +s(14, n_splines=10) +f(20))
hey = GROUP_2*1
scaler = StandardScaler()
X = hey.drop(columns=['stroke_1'])
y = hey['stroke_1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.reset_index(drop = True, inplace = True)
X_test.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)

scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.fit_transform(X_test[numeric_features])

model_gam.fit(X_train, y_train)
y_pred = model_gam.predict(X_test)

y_pred_probs = model_gam.predict_proba(X_test)
conf_matrix = plot_confusion_matrix(y_test, y_pred)
conf_matrix

print("ROC AUC Score:", roc_auc_score(y_test, y_pred_probs))
print("Log Loss:", log_loss(y_test, y_pred_probs))
ROC AUC Score: 0.6683634992458521
Log Loss: 0.20914310159657315
No description has been provided for this image
In [67]:
lam_values = np.logspace(-3, 3, 7)  # 7 values from 10^-3 to 10^3
gam = LogisticGAM(f(0) +f(1) +f(2) +f(3) +f(4) +f(5) + f(6) +f(7) +f(8) +f(9) +f(10) +f(11) +s(12, n_splines=10) + s(13, n_splines=10) +s(14, n_splines=10) +f(20))
best_gam = gam.gridsearch(X_train.values, y_train, lam=lam_values)
print(best_gam.summary())
100% (7 of 7) |##########################| Elapsed Time: 0:00:00 Time:  0:00:000:00
LogisticGAM                                                                                               
=============================================== ==========================================================
Distribution:                      BinomialDist Effective DoF:                                      4.3357
Link Function:                        LogitLink Log Likelihood:                                  -314.3066
Number of Samples:                         1675 AIC:                                              637.2846
                                                AICc:                                             637.3252
                                                UBRE:                                               2.3825
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.1181
==========================================================================================================
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
================================= ==================== ============ ============ ============ ============
f(0)                              [1000.]              2            1.0          9.47e-01                 
f(1)                              [1000.]              2            0.1          1.07e-02     *           
f(2)                              [1000.]              2            0.1          1.81e-01                 
f(3)                              [1000.]              2            0.0          3.43e-01                 
f(4)                              [1000.]              1            0.0          1.00e+00                 
f(5)                              [1000.]              2            0.1          3.24e-01                 
f(6)                              [1000.]              2            0.0          1.25e-01                 
f(7)                              [1000.]              1            0.0          1.00e+00                 
f(8)                              [1000.]              2            0.0          9.97e-01                 
f(9)                              [1000.]              2            0.0          1.58e-01                 
f(10)                             [1000.]              2            0.0          9.47e-01                 
f(11)                             [1000.]              2            0.0          3.91e-01                 
s(12)                             [1000.]              10           1.0          7.25e-01                 
s(13)                             [1000.]              10           0.9          3.68e-09     ***         
s(14)                             [1000.]              10           1.0          6.81e-01                 
f(20)                             [1000.]              2            0.0          2.79e-03     **          
intercept                                              1            0.0          1.33e-15     ***         
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
         which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
         known smoothing parameters, but when smoothing parameters have been estimated, the p-values
         are typically lower than they should be, meaning that the tests reject the null too readily.
None
/var/folders/3b/1dzvglj17w110wnj0rq1ngkh0000gn/T/ipykernel_1417/2327313024.py:4: UserWarning: KNOWN BUG: p-values computed in this summary are likely much smaller than they should be. 
 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  print(best_gam.summary())
In [68]:
# keep s13 and f20 and f1

model_gam = LogisticGAM(f(1)+ s(13, n_splines=10) +f(20))
model_gam.fit(X_train, y_train)
y_pred = model_gam.predict(X_test)
y_pred_probs = model_gam.predict_proba(X_test)
conf_matrix = plot_confusion_matrix(y_test, y_pred)
conf_matrix
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_probs))
print("Log Loss:", log_loss(y_test, y_pred_probs))
ROC AUC Score: 0.6831447963800905
Log Loss: 0.20445880683245488
No description has been provided for this image
In [ ]:
 
In [ ]: